Question 1

a)

vote_lm = lm(trump.vote~metro+non.citizen+unemployed, data=election2016)
summary(vote_lm)
## 
## Call:
## lm(formula = trump.vote ~ metro + non.citizen + unemployed, data = election2016)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27302 -0.04527 -0.00485  0.06165  0.14827 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.62064    0.06793   9.136 6.65e-12 ***
## metro       -0.18211    0.10118  -1.800   0.0784 .  
## non.citizen -1.25402    0.57049  -2.198   0.0330 *  
## unemployed   1.58516    1.15351   1.374   0.1760    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08011 on 46 degrees of freedom
## Multiple R-squared:  0.4072, Adjusted R-squared:  0.3686 
## F-statistic: 10.53 on 3 and 46 DF,  p-value: 2.157e-05

Seems like non.citizen feature is the only one that is significant based on 0.033 p-value.

b)

reduced_vote_lm = lm(trump.vote~non.citizen, data=election2016)
summary(reduced_vote_lm)
## 
## Call:
## lm(formula = trump.vote ~ non.citizen, data = election2016)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.248506 -0.051852 -0.003506  0.067084  0.145411 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.5979     0.0225  26.569  < 2e-16 ***
## non.citizen  -1.9392     0.3783  -5.126 5.25e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08189 on 48 degrees of freedom
## Multiple R-squared:  0.3537, Adjusted R-squared:  0.3403 
## F-statistic: 26.27 on 1 and 48 DF,  p-value: 5.249e-06
anova(reduced_vote_lm, vote_lm)
## Analysis of Variance Table
## 
## Model 1: trump.vote ~ non.citizen
## Model 2: trump.vote ~ metro + non.citizen + unemployed
##   Res.Df     RSS Df Sum of Sq      F Pr(>F)
## 1     48 0.32188                           
## 2     46 0.29522  2  0.026657 2.0768 0.1369

Since the p value is greater than 0.05, we can conclude that adding other predicters does not improve our model

c)

all_vote_lm = lm(trump.vote~. -state, data=election2016)
summary(all_vote_lm)
## 
## Call:
## lm(formula = trump.vote ~ . - state, data = election2016)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.154253 -0.028060  0.000814  0.031951  0.091114 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.837e+00  6.129e-01   2.998  0.00466 ** 
## median.income -1.003e-06  1.882e-06  -0.533  0.59697    
## unemployed     2.430e-01  9.441e-01   0.257  0.79821    
## metro         -1.660e-01  6.928e-02  -2.397  0.02132 *  
## high.school   -1.049e+00  4.620e-01  -2.271  0.02863 *  
## non.citizen    6.664e-02  4.391e-01   0.152  0.88013    
## white.pov     -7.551e-01  5.425e-01  -1.392  0.17163    
## income.ineq   -4.555e-01  6.465e-01  -0.705  0.48520    
## non.white     -1.916e-01  6.987e-02  -2.741  0.00910 ** 
## outcomeRed     1.201e-01  2.049e-02   5.860 7.44e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04835 on 40 degrees of freedom
## Multiple R-squared:  0.8122, Adjusted R-squared:   0.77 
## F-statistic: 19.22 on 9 and 40 DF,  p-value: 6.64e-12
anova(reduced_vote_lm, all_vote_lm)
## Analysis of Variance Table
## 
## Model 1: trump.vote ~ non.citizen
## Model 2: trump.vote ~ (state + median.income + unemployed + metro + high.school + 
##     non.citizen + white.pov + income.ineq + non.white + outcome) - 
##     state
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1     48 0.32188                                  
## 2     40 0.09352  8   0.22835 12.208 1.234e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Anova test also confirms that the full model is better than the reduced model since we have a very low p-value. In this model high.school and outcomeRed were significant predictors. And we can also observe that R2 value almost doubled compared to previous SLM. This means that the final model performs much better.

d)

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
election_3d = plot_ly(election2016, x=~high.school, y=~outcome, z=~trump.vote)
election_3d %>% add_markers

Seems like with the blue outcome, the high.school numbers are majorly higher than red outcome.

Question 2

a)

gapminder <- read.csv("~/Documents/math133/datasets/gapminder.csv")
filtered_gapminder <- gapminder %>% 
  filter(year == 2007)

head(filtered_gapminder)
##       country year      pop continent  gdpPercap lifeExp
## 1 Afghanistan 2007 31889923      Asia   974.5803  43.828
## 2     Albania 2007  3600523    Europe  5937.0295  76.423
## 3     Algeria 2007 33333216    Africa  6223.3675  72.301
## 4      Angola 2007 12420476    Africa  4797.2313  42.731
## 5   Argentina 2007 40301927  Americas 12779.3796  75.320
## 6   Australia 2007 20434176   Oceania 34435.3674  81.235
library(ggplot2)
ggplot(filtered_gapminder, aes(x = continent, y = lifeExp)) +
  geom_bar(stat = "identity") +
  labs(title = "Bar Chart", x = "continent", y = "lifeExp") +
  theme_minimal()

oceania_out <- filtered_gapminder %>%
  filter(continent != "Oceania")

ggplot(oceania_out, aes(x = continent, y = lifeExp)) +
  geom_bar(stat = "identity") +
  labs(title = "Bar Chart", x = "continent", y = "lifeExp") +
  theme_minimal()

### b)

gapminder_lm = lm(lifeExp~pop+gdpPercap+continent, data=oceania_out)
summary(gapminder_lm)
## 
## Call:
## lm(formula = lifeExp ~ pop + gdpPercap + continent, data = oceania_out)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.8100  -3.0434   0.1563   2.9280  20.0551 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       5.371e+01  9.389e-01  57.205  < 2e-16 ***
## pop               9.697e-10  3.940e-09   0.246    0.806    
## gdpPercap         3.487e-04  5.747e-05   6.068 1.25e-08 ***
## continentAmericas 1.603e+01  1.677e+00   9.553  < 2e-16 ***
## continentAsia     1.256e+01  1.627e+00   7.717 2.44e-12 ***
## continentEurope   1.518e+01  1.974e+00   7.689 2.84e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.62 on 134 degrees of freedom
## Multiple R-squared:  0.7088, Adjusted R-squared:  0.698 
## F-statistic: 65.24 on 5 and 134 DF,  p-value: < 2.2e-16

We can say that gdpPercap is an important mf :) bc the p-value is low. The p-value of pop is very big, so we can say that it can fuck off. Its coefficient is anyways very small. The baseline category for continent is Africa. The features continentAmericas, continentAsia, continentEurope has very low p-values with 1.603e+01, 1.256e+01, 1.518e+01 coefficients respectively. This coefficients indicate how model performs compared to continentAfrica which is the baseline category.

c)

oceania_out %>% ggplot(aes(x=gdpPercap,y=continent))+geom_point(shape=1)

we can say that in the order Africa, Americas, Asia and Europe, GDP of values grow.

comparing reduced model to full model

gapminder_reduced_lm = lm(lifeExp~gdpPercap+continent, data=oceania_out)
summary(gapminder_reduced_lm)
## 
## Call:
## lm(formula = lifeExp ~ gdpPercap + continent, data = oceania_out)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.9058  -2.8987   0.1394   2.9136  20.0445 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       5.373e+01  9.317e-01  57.674  < 2e-16 ***
## gdpPercap         3.474e-04  5.703e-05   6.092 1.09e-08 ***
## continentAmericas 1.605e+01  1.668e+00   9.625  < 2e-16 ***
## continentAsia     1.266e+01  1.563e+00   8.103 2.85e-13 ***
## continentEurope   1.521e+01  1.964e+00   7.746 2.02e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.597 on 135 degrees of freedom
## Multiple R-squared:  0.7087, Adjusted R-squared:  0.7001 
## F-statistic: 82.11 on 4 and 135 DF,  p-value: < 2.2e-16
anova(gapminder_reduced_lm, gapminder_lm)
## Analysis of Variance Table
## 
## Model 1: lifeExp ~ gdpPercap + continent
## Model 2: lifeExp ~ pop + gdpPercap + continent
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    135 5875.6                           
## 2    134 5872.9  1     2.655 0.0606  0.806

added the multiplicative effect, compared to reduced model

gapminder_lm_multiplicative = lm(lifeExp~gdpPercap*continent, data=oceania_out)
summary(gapminder_lm)
## 
## Call:
## lm(formula = lifeExp ~ pop + gdpPercap + continent, data = oceania_out)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.8100  -3.0434   0.1563   2.9280  20.0551 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       5.371e+01  9.389e-01  57.205  < 2e-16 ***
## pop               9.697e-10  3.940e-09   0.246    0.806    
## gdpPercap         3.487e-04  5.747e-05   6.068 1.25e-08 ***
## continentAmericas 1.603e+01  1.677e+00   9.553  < 2e-16 ***
## continentAsia     1.256e+01  1.627e+00   7.717 2.44e-12 ***
## continentEurope   1.518e+01  1.974e+00   7.689 2.84e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.62 on 134 degrees of freedom
## Multiple R-squared:  0.7088, Adjusted R-squared:  0.698 
## F-statistic: 65.24 on 5 and 134 DF,  p-value: < 2.2e-16
anova(gapminder_lm_multiplicative, gapminder_reduced_lm)
## Analysis of Variance Table
## 
## Model 1: lifeExp ~ gdpPercap * continent
## Model 2: lifeExp ~ gdpPercap + continent
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1    132 5474.8                              
## 2    135 5875.6 -3   -400.83 3.2214 0.02485 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Seems like adding the multiplication of gdpPercap and continent improves the reduced model slightly.

d)

ggplot(gapminder, aes(x = gdpPercap, y = lifeExp, color = continent)) +
  geom_point(alpha = 0.6) +  # Adds scatter points with slight transparency
  labs(
    title = "Life Expectancy vs GDP per Capita",
    x = "GDP per Capita (log scale)",
    y = "Life Expectancy",
    color = "Continent"
  ) +
  theme_minimal()

we see that as life expectancy goes higher, in continents where GDP is higher accordingly.

ggplot(gapminder, aes(x = log(gdpPercap), y = lifeExp, color = continent)) +
  geom_point(alpha = 0.6) +  # Adds scatter points with slight transparency
  labs(
    title = "Life Expectancy vs GDP per Capita",
    x = "GDP per Capita (log scale)",
    y = "Life Expectancy",
    color = "Continent"
  ) +
  theme_minimal()

Yes, we still see that there is a growth in life expectancy in the continents with higher GDP